(3) Modeling Topic Shifts

Author

Helen Schmidt

Modified

January 28, 2025

Setup

# check for colorblind-friendly palettes
# print the MetBrewer palettes flagged as colorblind-safe so a safe
# palette can be chosen for the figures below
MetBrewer::colorblind_met_palettes
 [1] "Archambault" "Cassatt1"    "Cassatt2"    "Demuth"      "Derain"     
 [6] "Egypt"       "Greek"       "Hiroshige"   "Hokusai2"    "Hokusai3"   
[11] "Ingres"      "Isfahan1"    "Isfahan2"    "Java"        "Johnson"    
[16] "Kandinsky"   "Morgenstern" "OKeeffe1"    "OKeeffe2"    "Pillement"  
[21] "Tam"         "Troy"        "VanGogh3"    "Veronese"   

1. Jaccard Similarity

# Load participant raw data and collapse it to one row per annotator:
# each participant's list of topic-switch turn positions.
# Resulting row count should be 1505 (the number of participants).
jaccard_df <- read.csv("./data/processed/jaccard_data.csv") |>
  select(PID, transcript_id, new_topic, scaled_turn_id) |>
  # keep only turns the participant marked as a topic switch
  filter(!is.na(new_topic)) |>
  group_by(PID, transcript_id) |>
  summarize(topic_switch_turns = list(scaled_turn_id), .groups = "drop")
# Jaccard similarity between two annotators' topic-switch turn sets.
# Each marked turn is expanded to a symmetric window of +/- `window` turns
# before comparing; `window` defaults to 3 for backward compatibility.
#
# Args:
#   turns_A, turns_B  numeric vectors of turn IDs marked as topic switches
#   window            half-width of the turn window around each mark
# Returns: |A intersect B| / |A union B| (NaN if both sets are empty)
jaccard_similarity_3 <- function(turns_A, turns_B, window = 3) {
  # expand each marked turn into the window of surrounding turn IDs
  set_A <- unlist(lapply(turns_A, function(x) seq(x - window, x + window, by = 1)))
  set_B <- unlist(lapply(turns_B, function(x) seq(x - window, x + window, by = 1)))
  # jaccard similarity calculation; intersect()/union() both de-duplicate,
  # so overlapping windows within one annotator do not inflate the score
  n_intersect <- length(intersect(set_A, set_B))
  n_union <- length(union(set_A, set_B))
  n_intersect / n_union
}

# Jaccard similarity with a 5-turn window (the original comment here wrongly
# said "3-turn window"). Same computation as jaccard_similarity_3, with the
# window half-width defaulting to 5.
#
# Args:
#   turns_A, turns_B  numeric vectors of turn IDs marked as topic switches
#   window            half-width of the turn window around each mark
# Returns: |A intersect B| / |A union B| (NaN if both sets are empty)
jaccard_similarity_5 <- function(turns_A, turns_B, window = 5) {
  # create sets of turns within the 5-turn window around each mark
  set_A <- unlist(lapply(turns_A, function(x) seq(x - window, x + window, by = 1)))
  set_B <- unlist(lapply(turns_B, function(x) seq(x - window, x + window, by = 1)))
  # jaccard similarity calculation (union() avoids shadowing base::union)
  n_intersect <- length(intersect(set_A, set_B))
  n_union <- length(union(set_A, set_B))
  n_intersect / n_union
}

3 turns

# calculate jaccard similarity for each transcript
# and mean score + SD
# Each transcript was annotated by several participants; compare every
# ordered annotator pair within a transcript.
jaccard_df |>
  group_by(transcript_id) |>
  # all ordered pairs: (A,B) and (B,A) both appear; Jaccard is symmetric,
  # so the double counting does not change the mean
  expand(PID1 = PID, PID2 = PID) |>
  filter(PID1 != PID2) |>
  # attach each annotator's topic-switch turn list
  # (.x / .y suffixes come from the two joins)
  left_join(jaccard_df, by = c("transcript_id", "PID1" = "PID")) |>
  left_join(jaccard_df, by = c("transcript_id", "PID2" = "PID")) |>
  rowwise() |>
  mutate(jaccard_score = jaccard_similarity_3(topic_switch_turns.x, topic_switch_turns.y)) |>
  select(transcript_id, jaccard_score, PID1, PID2) |>
  ungroup() |>
  group_by(transcript_id) |>
  # per-transcript average over all annotator pairs (repeated on each row)
  mutate(transcript_average_jaccard = mean(jaccard_score)) |>
  ungroup() |>
  # NOTE(review): because mutate() (not summarize()) repeats the transcript
  # average on every pair row, this grand mean/SD weights transcripts by
  # their number of annotator pairs — confirm this is intended
  summarize(mean_jaccard_score = mean(transcript_average_jaccard),
            sd_jaccard_score = sd(transcript_average_jaccard))
# A tibble: 1 × 2
  mean_jaccard_score sd_jaccard_score
               <dbl>            <dbl>
1              0.293           0.0600

Moderate agreement between participants: there were some commonly marked topic shifts, but also some notable differences between annotators.

5 turns

# calculate jaccard similarity for each transcript
# and mean score + SD
# Same pairwise pipeline as above, but with the 5-turn window.
# NOTE(review): the rendered output below is identical to the 3-turn result
# (0.293 / 0.0600) — possibly a stale cached chunk; re-render to confirm.
jaccard_df |>
  group_by(transcript_id) |>
  expand(PID1 = PID, PID2 = PID) |>
  filter(PID1 != PID2) |>
  left_join(jaccard_df, by = c("transcript_id", "PID1" = "PID")) |>
  left_join(jaccard_df, by = c("transcript_id", "PID2" = "PID")) |>
  rowwise() |>
  mutate(jaccard_score = jaccard_similarity_5(topic_switch_turns.x, topic_switch_turns.y)) |>
  select(transcript_id, jaccard_score, PID1, PID2) |>
  ungroup() |>
  group_by(transcript_id) |>
  mutate(transcript_average_jaccard = mean(jaccard_score)) |>
  ungroup() |>
  # NOTE(review): as above, this weights transcripts by pair count
  summarize(mean_jaccard_score = mean(transcript_average_jaccard),
            sd_jaccard_score = sd(transcript_average_jaccard))
# A tibble: 1 × 2
  mean_jaccard_score sd_jaccard_score
               <dbl>            <dbl>
1              0.293           0.0600

2. Transition Matrix

# load transcript data
all_pid_transcripts <- read.csv("./data/processed/all_participant_transcripts.csv")
# make all topics lowercase
all_pid_transcripts$new_topic <- tolower(all_pid_transcripts$new_topic)

# load clustered data
cluster_50 <- read.csv("./data/output/topic_clusters_50.csv")
# load cluster labels
cluster_labels <- read.csv("./data/output/topic_cluster_labels_50.csv")
# add cluster labels to clustered data
cluster_50 <- merge(cluster_50, cluster_labels, by = "clusters")
# select only relevant variables
cluster_50 <- cluster_50 |>
  select(PID, topic_order, new_topic, cluster_label)

# add clustered data to all_pid_transcripts
# left join (all.x = TRUE): keep every transcript row even when it has
# no cluster assignment
transcripts_clusters <- merge(all_pid_transcripts, cluster_50,
                              by = c("PID", "new_topic", "topic_order"),
                              all.x = TRUE)

# check that all observations are accounted for
# (anti_join should return zero rows if no transcript rows were dropped)
anti_join(all_pid_transcripts, transcripts_clusters)
Joining with `by = join_by(turn_id, transcript_id, scaled_turn_id, speaker,
utterance, PID, new_topic, topic_order)`
[1] turn_id        transcript_id  scaled_turn_id speaker        utterance     
[6] PID            new_topic      topic_order   
<0 rows> (or 0-length row.names)
# remove intermediate data frames that are no longer needed
rm(all_pid_transcripts, cluster_50, cluster_labels)
# create themes
# tag 50 clusters with the 10 cluster groups to give better insight into transitions
# (rewritten with %in% so each theme's label set is listed once instead of one
# case_when arm per label; unmatched labels still fall through to NA, exactly
# as in the original chain)
transcripts_clusters <- transcripts_clusters |>
  mutate(group_theme = case_when(
    cluster_label %in% c("pandemic, quarantine", "masks", "covid",
                         "health, illness", "zoom") ~ "COVID-19",
    cluster_label %in% c("family", "relationships", "parents",
                         "kids, children",
                         "people, fictional characters") ~ "relationships",
    cluster_label %in% c("employment, job", "work", "career, profession",
                         "money, economy", "future plans") ~ "work",
    cluster_label %in% c("school", "college", "classes, teaching",
                         "question, discussion",
                         "technology, computer") ~ "school",
    cluster_label %in% c("research, study", "prolific", "survey",
                         "technical issue", "time") ~ "research",
    cluster_label %in% c("starting the call", "ending the call", "greeting",
                         "goodbye", "introduction",
                         "ending conversation") ~ "interaction",
    cluster_label %in% c("pets", "travel", "mood, preferences",
                         "personal experience", "age") ~ "preferences",
    cluster_label %in% c("weather", "location", "living situation, home",
                         "city, state, country", "holiday") ~ "location",
    cluster_label %in% c("crime, protest, government", "news, social media",
                         "election, politics",
                         "natural disasters, climate") ~ "news & politics",
    # NOTE: "activites" is spelled this way in the cluster label data;
    # the typo must be kept for the match to succeed
    cluster_label %in% c("music, instruments", "television, movies",
                         "items, fashion, clothing", "food, drink",
                         "hobbies, activites, sports") ~ "entertainment"
  ))
# get probabilities of transitioning between each cluster
transition_matrix <- transcripts_clusters |>
  ungroup() |>
  group_by(PID) |>
  arrange(turn_id, .by_group = TRUE) |>
  select(PID, cluster_label) |>
  # NOTE(review): distinct() keeps only the FIRST occurrence of each
  # (PID, cluster_label) pair, so a topic revisited later in a conversation
  # is dropped entirely rather than collapsed to adjacent runs — confirm
  # this is the intended treatment of repeated topics
  distinct() |>
  # pair each topic with the one before it within a participant
  mutate(current_topic_number = 1:n(),
         current_topic = cluster_label,
         prior_topic_number = lag(current_topic_number),
         prior_topic = lag(cluster_label)) |>
  # drop each participant's first topic (it has no prior topic)
  na.omit() |>
  ungroup() |>
  group_by(prior_topic, current_topic) |>
  summarize(transition_count = n()) |>
  ungroup() |>
  group_by(prior_topic) |>
  # row-normalize: probability = P(current topic | prior topic)
  mutate(prior_sum = sum(transition_count)) |>
  mutate(probability = transition_count/prior_sum)
`summarise()` has grouped output by 'prior_topic'. You can override using the
`.groups` argument.
# get probabilities of transitioning between each theme
# (same pipeline as the cluster-level matrix, at the 10-theme level)
theme_matrix <- transcripts_clusters |>
  ungroup() |>
  group_by(PID) |>
  arrange(turn_id, .by_group = TRUE) |>
  select(PID, group_theme) |>
  # NOTE(review): as above, distinct() drops any theme a participant
  # returns to later in the conversation — confirm intended
  distinct() |>
  mutate(current_topic_number = 1:n(),
         current_topic = group_theme,
         prior_topic_number = lag(current_topic_number),
         prior_topic = lag(group_theme)) |>
  # drop each participant's first theme (no prior theme exists)
  na.omit() |>
  ungroup() |>
  group_by(prior_topic, current_topic) |>
  summarize(transition_count = n()) |>
  ungroup() |>
  group_by(prior_topic) |>
  # row-normalize: probability = P(current theme | prior theme)
  mutate(prior_sum = sum(transition_count)) |>
  mutate(probability = transition_count/prior_sum)
`summarise()` has grouped output by 'prior_topic'. You can override using the
`.groups` argument.

Plot Matrix

# flag, within each prior topic, the transition(s) tied for the highest
# probability (these cells get a black outline in the heatmap)
top_prob <- transition_matrix |>
  ungroup() |>
  group_by(prior_topic) |>
  mutate(highlight = max(probability) == probability) |>
  ungroup()

# plot!
# heatmap of cluster-to-cluster transition probabilities; the most likely
# destination from each prior topic is re-drawn with a black outline
ggplot(transition_matrix, aes(x = prior_topic, y = current_topic,
                           fill = probability)) +
  geom_tile(color = "white") + # gradient fill
  geom_tile(data = subset(top_prob, highlight), 
            aes(x = prior_topic, y = current_topic,
                fill = probability),
            color = "black", linewidth = 0.5) + # outline highlight
  # geom_text(data = subset(top_prob, highlight), 
  #           aes(x = prior_topic, y = current_topic, label = round(probability, digits = 2))) +
  labs(title = "Transition Matrix",
       x = "Prior Topic", y = "Current Topic", fill = "Probability") +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  scale_fill_gradientn(colors = c("white", "#e96052"))

# # save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/transition_matrix.pdf",
#        width = 14,
#        height = 10,
#        units = "in")
# 
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/transition_matrix.png",
#        width = 14,
#        height = 10,
#        units = "in")
# flag, within each prior theme, the transition(s) tied for the highest
# probability (these cells get a black outline in the theme heatmap)
top_prob_theme <- theme_matrix |>
  ungroup() |>
  group_by(prior_topic) |>
  mutate(highlight = max(probability) == probability) |>
  ungroup()

# plot!
# heatmap of theme-to-theme transition probabilities; the most likely
# destination from each prior theme is outlined and labeled with its value
ggplot(theme_matrix, aes(x = prior_topic, y = current_topic,
                           fill = probability)) +
  geom_tile(color = "white") + # gradient fill
  geom_tile(data = subset(top_prob_theme, highlight), 
            aes(x = prior_topic, y = current_topic,
                fill = probability),
            color = "black", linewidth = 0.5) + # outline highlight
  geom_text(data = subset(top_prob_theme, highlight),
            aes(x = prior_topic, y = current_topic, label = round(probability, digits = 2))) +
  labs(title = "Transition Matrix",
       x = "Prior Topic", y = "Current Topic", fill = "Probability") +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  scale_fill_gradientn(colors = c("white", "#e96052"))

# # save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/theme_transition_matrix.pdf",
#        width = 8,
#        height = 6,
#        units = "in")
# 
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/theme_transition_matrix.png",
#        width = 8,
#        height = 6,
#        units = "in")

Plot DAG

# make an edges data frame (edge list)
edges <- transition_matrix |>
  select(from = prior_topic, to = current_topic, probability) |>
  distinct()

# make a graph object from the edge list
graph <- as_tbl_graph(edges, directed = TRUE)

# add topic base rate as node attribute to graph
# NOTE(review): this assigns by POSITION and assumes the graph's node order
# matches the order of distinct prior_topic values (and that every node also
# occurs as a prior topic); a mismatch would misalign or error — confirm
V(graph)$prior_sum <- transition_matrix |>
  select(prior_topic, prior_sum) |>
  distinct() |>
  pull(prior_sum)

# copy transition probability onto the edges as the weight attribute
# NOTE(review): also positional — assumes the graph's edge order matches the
# row order of transition_matrix after distinct(); confirm lengths line up
E(graph)$weight <- transition_matrix$probability

# pull the edge table back out of the graph as a tibble
edges <- as_tibble(igraph::as_data_frame(graph, what = "edges"))

# color each edge by its source node, then drop low-probability edges
# (< 0.05) for readability
edges <- edges |> mutate(edge_color = as.factor(from))
graph <- graph |>
  activate(edges) |>
  mutate(edge_color = edges$edge_color) |>
  filter(weight >= 0.05)

# give every remaining edge its own group id so each arc is drawn separately
graph <- graph %>%
  mutate(edge_group = as.factor(seq_along(E(graph))))
# plot
# circular layout of the filtered cluster transition graph, with node size
# showing topic base rate and edge width showing transition probability
set.seed(2)
ggraph(graph, layout = "circle") +
  # strength replaces curvature, which was deprecated in ggraph 2.0.0
  geom_edge_arc(aes(width = weight,  color = edge_color, group = edge_group),
                check_overlap = TRUE, angle_calc = "along",
                start_cap = circle(0.05, "in"),
                end_cap = circle(0.05, "in"), strength = 0.05) +
  geom_node_point(aes(size = prior_sum, color = name)) +
  geom_node_text(aes(label = name, color = name)) +
  scale_edge_width(range = c(0.1, 2)) +
  coord_fixed() +
  theme_void() +
  scale_edge_color_manual(values = met.brewer("Hiroshige", 50)) +
  scale_color_manual(values = met.brewer("Hiroshige", 50)) +
  theme(legend.position = "none")
Warning: The `curvature` argument of `geom_edge_arc()` is deprecated as of ggraph 2.0.0.
ℹ Please use the `strength` argument instead.

# # save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/DAG_names.pdf",
#        width = 6,
#        height = 6,
#        units = "in")
# 
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/DAG_names.png",
#        width = 6,
#        height = 6,
#        units = "in")
# plot
# same circular graph as above but without node labels (for the figure)
set.seed(2)
ggraph(graph, layout = "circle") +
  # strength replaces curvature, which was deprecated in ggraph 2.0.0
  geom_edge_arc(aes(width = weight,  color = edge_color, group = edge_group),
                check_overlap = TRUE, angle_calc = "along",
                start_cap = circle(0.05, "in"),
                end_cap = circle(0.05, "in"), strength = 0.05) +
  geom_node_point(aes(size = prior_sum, color = name)) +
  scale_edge_width(range = c(0.1, 2)) +
  coord_fixed() +
  theme_void() +
  scale_edge_color_manual(values = met.brewer("Hiroshige", 50)) +
  scale_color_manual(values = met.brewer("Hiroshige", 50)) +
  theme(legend.position = "none")

# # save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/DAG.pdf",
#        width = 6,
#        height = 6,
#        units = "in")
# 
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/DAG.png",
#        width = 6,
#        height = 6,
#        units = "in")
# now make a version with themes
# make an edges data frame (edge list)
edges <- theme_matrix |>
  select(from = prior_topic, to = current_topic, probability) |>
  distinct()

# make a graph object from the edge list
graph <- as_tbl_graph(edges, directed = TRUE)

# add topic base rate as node attribute to graph
# NOTE(review): positional assignment — assumes node order matches the order
# of distinct prior_topic values in theme_matrix; confirm alignment
V(graph)$prior_sum <- theme_matrix |>
  select(prior_topic, prior_sum) |>
  distinct() |>
  pull(prior_sum)

# copy transition probability onto the edges as the weight attribute
# NOTE(review): positional — assumes edge order matches theme_matrix rows
E(graph)$weight <- theme_matrix$probability

# pull the edge table back out of the graph as a tibble
edges <- as_tibble(igraph::as_data_frame(graph, what = "edges"))

# color each edge by its source node, then drop low-probability edges
# (< 0.125; higher threshold than the 50-cluster graph, fewer nodes)
edges <- edges |> mutate(edge_color = as.factor(from))
graph <- graph |>
  activate(edges) |>
  mutate(edge_color = edges$edge_color) |>
  filter(weight >= 0.125)

# give every remaining edge its own group id so each arc is drawn separately
graph <- graph %>%
  mutate(edge_group = as.factor(seq_along(E(graph))))
# plot
# circular layout of the filtered theme transition graph, with labels
set.seed(2)
ggraph(graph, layout = "circle") +
  # strength replaces curvature, which was deprecated in ggraph 2.0.0
  geom_edge_arc(aes(width = weight,  color = edge_color, group = edge_group),
                check_overlap = TRUE, angle_calc = "along",
                start_cap = circle(0.05, "in"),
                end_cap = circle(0.05, "in"), strength = 0.05) +
  geom_node_point(aes(size = prior_sum, color = name)) +
  geom_node_text(aes(label = name, color = name)) +
  scale_edge_width(range = c(0.1, 2)) +
  coord_fixed() +
  theme_void() +
  scale_edge_color_manual(values = met.brewer("Hiroshige", 10)) +
  scale_color_manual(values = met.brewer("Hiroshige", 10)) +
  theme(legend.position = "none")

# plot
# same theme graph without node labels (for the figure)
set.seed(2)
ggraph(graph, layout = "circle") +
  # strength replaces curvature, which was deprecated in ggraph 2.0.0
  geom_edge_arc(aes(width = weight,  color = edge_color, group = edge_group),
                check_overlap = TRUE, angle_calc = "along",
                start_cap = circle(0.05, "in"),
                end_cap = circle(0.05, "in"), strength = 0.05) +
  geom_node_point(aes(size = prior_sum, color = name)) +
  scale_edge_width(range = c(0.1, 2)) +
  coord_fixed() +
  theme_void() +
  scale_edge_color_manual(values = met.brewer("Hiroshige", 10)) +
  scale_color_manual(values = met.brewer("Hiroshige", 10)) +
  theme(legend.position = "none")

# # save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/DAG_theme.pdf",
#        width = 6,
#        height = 6,
#        units = "in")
# 
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/DAG_theme.png",
#        width = 6,
#        height = 6,
#        units = "in")

Plot Ridges

What proportion of transcripts are in a given cluster topic at a given scaled time point?

# calculate occurrence of each cluster label at each scaled_turn_id across 200 conversations
# prop = occurrence / 200
n_transcripts <- 200  # total conversations in the sample
topic_occ <- transcripts_clusters |>
  select(PID, transcript_id, cluster_label, scaled_turn_id, group_theme) |>
  distinct() |>
  group_by(scaled_turn_id, cluster_label, group_theme) |>
  # count how many unique transcripts contain each label at each turn.
  # Replaces the earlier two-step that built a per-transcript binary
  # indicator (as.integer(any(PID == PID)), which is always 1 for a
  # non-empty group) and summed it — this computes the same quantity
  summarize(transcript_count = n_distinct(transcript_id), .groups = "drop") |>
  # proportion of all transcripts containing the label at that point;
  # also rescale turn position to a 0-100 completion percentage
  mutate(prop = transcript_count / n_transcripts,
         scaled_turn_id = scaled_turn_id*100)

Make the plot colors match the theme grouping!

# plot
# ridgeline of each cluster label's prevalence over conversation time;
# labels are ordered by their prop-weighted turn position so early topics
# sit at one end and late topics at the other
ggplot(topic_occ, aes(x = scaled_turn_id, 
                      y = fct_reorder(cluster_label, scaled_turn_id*prop, .fun = sum), 
                      height = prop, fill = group_theme)) + 
  geom_density_ridges(stat = "identity", scale = 4, rel_min_height = 0.005) +
  scale_x_continuous(expand = c(0, 0), limits = c(0,101)) +
  labs(y = "Cluster Label", x = "Conversation Completion (%)") +
  scale_fill_manual(values = met.brewer("Hiroshige", 10)) +
  theme_cowplot() +
  theme(legend.position = "none")

# # save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/cluster_ridges.pdf",
#        width = 6,
#        height = 10,
#        units = "in")
# 
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/cluster_ridges.png",
#        width = 6,
#        height = 10,
#        units = "in")

Entropy

# 200 conversations, 200 topics at each bin point, cluster count = 200 for each time point
# Majority across participants who annotated that conversation; in case of tie pick randomly between them

# collect per-bin results in a list and bind once at the end;
# growing a data frame with rbind() inside the loop is quadratic
majority_rows <- list()

# loop through each unique transcript, and figure out majority cluster label per time bin
for (i in unique(transcripts_clusters$transcript_id)) {
  # select just this loop's transcript from transcripts_clusters
  this_transcript <- transcripts_clusters |> filter(transcript_id == i)
  # now loop through each scaled integer bin
  for (j in unique(this_transcript$scaled_turn_id)) {
    # select just this loop's bin
    this_bin <- this_transcript |> filter(scaled_turn_id == j)
    # count participant-level cluster labels
    # NOTE(review): summarize() with unique() can return multiple rows per
    # PID when a participant has several labels in one bin; newer dplyr
    # deprecates this in favor of reframe() — confirm intended
    participant_count <- this_bin |>
      group_by(PID) |>
      summarize(participant_cluster_label = unique(cluster_label)) |>
      ungroup() |>
      group_by(participant_cluster_label) |>
      mutate(cluster_count = length(participant_cluster_label)) |>
      # select only label and count
      select(participant_cluster_label, cluster_count) |>
      distinct() |>
      arrange(-cluster_count) |>
      ungroup()
    # take top row cluster if only one winner, if not, randomly take one of top labels if tie
    majority_cluster <- participant_count |>
      slice_max(order_by = cluster_count, n = 1) # may return more than 1 if tie
    # randomly sample from rows of majority cluster; no seed is set before
    # this loop, so tie-breaking is not reproducible across runs
    this_cluster_label <- sample(majority_cluster$participant_cluster_label, 1)
    # save the bin, transcript ID, and majority cluster label for entropy analysis
    majority_rows[[length(majority_rows) + 1]] <- data.frame(
      bin = j,
      transcript_id = i,
      majority_cluster_label = this_cluster_label)
  }
}
# bind all per-bin rows into a single data frame
transcript_majority_clusters <- if (length(majority_rows) > 0) {
  do.call(rbind, majority_rows)
} else {
  data.frame()
}
# now compute the percentage appearance of each majority cluster topic across transcripts per time bin
topic_prop_majority <- transcript_majority_clusters |>
  group_by(bin, majority_cluster_label) |>
  # one row per (bin, label) with the number of transcripts carrying it
  summarize(cluster_count = n(), .groups = "drop") |>
  group_by(bin) |>
  # share of transcripts per label within each time bin
  mutate(bin_count = sum(cluster_count),
         prop = cluster_count / bin_count) |>
  ungroup()
`summarise()` has grouped output by 'bin'. You can override using the `.groups`
argument.
# calculate entropy
# Shannon entropy of the majority-topic distribution at each time bin;
# NOTE(review): entropy() presumably comes from the `entropy` package and
# defaults to natural-log units — confirm which package/unit is intended
topic_entropy <- topic_prop_majority |>
  group_by(bin) |>
  summarize(prop_entropy = entropy(prop)) |>
  # rescale bin to a 0-100 conversation-completion percentage
  mutate(bin = bin * 100)
# plot entropy
# topic entropy across conversation completion percentage
ggplot(topic_entropy, aes(x = bin, y = prop_entropy)) + 
  geom_line() +
  scale_x_continuous(expand = c(0, 0), limits = c(0,101)) +
  labs(y = "Entropy", x = "Conversation Completion (%)") +
  theme_cowplot() +
  theme(legend.position = "none")

# # save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/entropy.pdf",
#        width = 6,
#        height = 2,
#        units = "in")
# 
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/entropy.png",
#        width = 6,
#        height = 2,
#        units = "in")

3. Topic Label + Cluster Label + Utterance Similarity

# load similarity scores between cluster labels, participant labels, and transcript utterances
sim <- read.csv("./data/output/embedding_similarity.csv") |>
  select(PID, transcript_id, cluster_label, scaled_turn_id,
         topic_cluster_similarity, topic_utterance_similarity, cluster_utterance_similarity)
# get transcript-level average utterance + cluster label similarity
# (one mean per transcript per scaled turn, averaging over annotators)
transcript_sim <- sim |>
  group_by(transcript_id, scaled_turn_id) |>
  summarize(mean_utterance_cluster_similarity = mean(cluster_utterance_similarity))
`summarise()` has grouped output by 'transcript_id'. You can override using the
`.groups` argument.
# now average across transcripts
# grand mean and SD of the transcript-level means at each scaled turn
transcript_sim_sum <- transcript_sim |>
  group_by(scaled_turn_id) |>
  summarize(mean_similarity = mean(mean_utterance_cluster_similarity),
            sd_similarity = sd(mean_utterance_cluster_similarity))
# plot
# mean similarity line with a +/- 1 SD ribbon
ggplot(transcript_sim_sum, aes(x = scaled_turn_id, y = mean_similarity)) +
  geom_ribbon(aes(ymin = mean_similarity - sd_similarity,
                  ymax = mean_similarity + sd_similarity),
              color = NA, alpha = 0.1) +
  geom_line() +
  theme_cowplot()

# get participant-level average utterance + topic/cluster label similarity
# (one mean per participant per scaled turn, for both label types)
PID_sim <- sim |>
  group_by(PID, scaled_turn_id) |>
  summarize(mean_utterance_topic_similarity = mean(topic_utterance_similarity),
            mean_utterance_cluster_similarity = mean(cluster_utterance_similarity))
`summarise()` has grouped output by 'PID'. You can override using the `.groups`
argument.
# now average across PIDs
# grand mean and SD of the participant-level means at each scaled turn,
# separately for participant topic labels and cluster labels
PID_sim_sum <- PID_sim |>
  group_by(scaled_turn_id) |>
  summarize(mean_topic_similarity = mean(mean_utterance_topic_similarity),
            sd_topic_similarity = sd(mean_utterance_topic_similarity),
            mean_cluster_similarity = mean(mean_utterance_cluster_similarity),
            sd_cluster_similarity = sd(mean_utterance_cluster_similarity))
# plot cluster vs topic lines
# topic-label similarity in blue, cluster-label similarity in red,
# each drawn with a +/- 1 SD ribbon
ggplot() +
  geom_ribbon(data = PID_sim_sum, aes(y = mean_topic_similarity, x = scaled_turn_id,
                  ymin = mean_topic_similarity - sd_topic_similarity,
                  ymax = mean_topic_similarity + sd_topic_similarity),
              fill = "blue", color = NA, alpha = 0.1) +
  geom_ribbon(data = PID_sim_sum, aes(y = mean_cluster_similarity, x = scaled_turn_id,
                  ymin = mean_cluster_similarity - sd_cluster_similarity,
                  ymax = mean_cluster_similarity + sd_cluster_similarity),
              fill = "red", color = NA, alpha = 0.1) +
  geom_line(data = PID_sim_sum, aes(x = scaled_turn_id, y = mean_topic_similarity), color = "blue") +
  geom_line(data = PID_sim_sum, aes(x = scaled_turn_id, y = mean_cluster_similarity), color = "red") +
  # (removed a stray bare geom_line(): on an empty ggplot() scaffold it has
  # no data and no x/y aesthetics, so it had nothing valid to draw)
  theme_cowplot()

# example participant
# trace one participant's topic-utterance similarity across the conversation,
# linking each row to the next one for segment drawing
example_sim <- sim |>
  filter(PID == "[False, '542498adfdf99b691fb384d1', None]") |>
  select(scaled_turn_id, topic_utterance_similarity, cluster_label) |>
  distinct() |>
  arrange(scaled_turn_id) |>
  # lead() is the idiomatic replacement for the manual c(x[-1], NA) shift;
  # the final row (all-NA leads) is then dropped by na.omit()
  mutate(next_x = lead(scaled_turn_id),
         next_y = lead(topic_utterance_similarity),
         next_label = lead(cluster_label)) |>
  na.omit()

# plot
# segments connect consecutive turns, colored by cluster label;
# the alpha = 0 points are invisible — presumably kept so the panel range
# covers all points; confirm they can be dropped
ggplot(example_sim, aes(x = scaled_turn_id, y = topic_utterance_similarity,
                    xend = next_x, yend = next_y,
                    color = cluster_label)) +
  geom_point(alpha = 0) +
  geom_segment() +
  theme_cowplot() +
  theme(legend.position = "none")

4. UMAP

5. Tiling

# load tiled cosine similarity data
# (window sizes of 10, 15, and 20 utterances)
df_tile_10 <- read.csv("./data/output/annotated_transcripts_tile_10_0.csv")
df_tile_15 <- read.csv("./data/output/annotated_transcripts_tile_15_0.csv")
df_tile_20 <- read.csv("./data/output/annotated_transcripts_tile_20_0.csv")
# load annotation data 
# keep only rows that have a participant ID
df_ann <- read.csv("./data/processed/dense_subset_processed.csv") |>
  filter(!is.na(PID))
# within annotation data, mark participants as coarse, middle, or granular annotators based on the lower, middle, and upper thirds of annotations provided

# calculate number of annotations per participant
annotation_numbers <- df_ann |>
  dplyr::group_by(PID) |>
  summarize(total_PID_annotations = length(turn_id)) # should be 1505, number of participants

# also create mutated variable in df_ann
# (same per-participant count as above, repeated on every annotation row)
df_ann <- df_ann |>
  dplyr::group_by(PID) |>
  mutate(total_PID_annotations = length(turn_id)) |>
  ungroup()

# calculate quantiles to split number of PID annotations into three groups
# (tertile cut points over the participant-level annotation counts)
quantiles <- quantile(annotation_numbers$total_PID_annotations,
                      probs = c(0, 1/3, 2/3, 1))

# create new variable in df_ann 
# lower third = coarse, middle third = middle, upper third = granular annotators
df_ann$annotation_behavior <- cut(df_ann$total_PID_annotations, breaks = quantiles,
                                  include.lowest = TRUE, 
                                  labels = c("coarse", "middle", "granular"))

# remove data not needed
rm(annotation_numbers, quantiles)

Create an annotation matching function. Examine annotation points for all participants and determine if a given tiled window contains the utterance they marked as signifying a topic shift in conversation. If the utterance they selected is within the window, mark it with “yes” and if not, mark it with a “no”. Do this for all participants separately.

# write a function to check if annotation turn ID is within a tiled window of utterances
#
# For each participant in annotation_df, find the transcript they annotated in
# tiling_df and mark each tiled window whose gap turn (A_end_turn) matches one
# of the participant's annotated turn IDs ("yes"/"no"), attaching the
# participant's topic label to matching rows.
#
# Args:
#   tiling_df     data frame of tiled windows with transcript_id and A_end_turn
#   annotation_df data frame of annotations with PID, transcript_id, turn_id,
#                 new_topic, and annotation_behavior
# Returns: one copy of the annotated transcript's window rows per participant,
#   with annotated_turn, annotated_label, PID, and annotation_behavior added.
detect_window_annotations <- function(tiling_df, annotation_df) {
  # collect one result per participant and bind once at the end
  # (avoids quadratic rbind-in-a-loop growth)
  pids <- unique(annotation_df$PID)
  per_pid_results <- vector("list", length(pids))
  for (k in seq_along(pids)) {
    a <- pids[k]
    # subset annotation DF to just this participant's annotations
    this_annotation <- annotation_df[which(annotation_df$PID == a), , drop = FALSE]
    # get corresponding transcript they annotated from tiling DF
    # NOTE(review): assumes each participant annotated exactly one transcript;
    # if unique() returned several IDs the comparison below would recycle
    this_transcript_id <- unique(this_annotation$transcript_id)
    annotations_result <- tiling_df[which(tiling_df$transcript_id == this_transcript_id), , drop = FALSE]
    # save PID annotation behavior
    behavior <- unique(this_annotation$annotation_behavior)
    # add new variables to hold this participant's annotated turn / label
    annotations_result$annotated_turn <- NA
    annotations_result$annotated_label <- NA
    # this participant's labeled topics and their turn IDs
    PID_labels <- this_annotation$new_topic
    PID_turns <- this_annotation$turn_id
    # add participant ID and annotation behavior
    annotations_result$PID <- a
    annotations_result$annotation_behavior <- behavior
    # 1a) does the gap turn (i.e., A_end_turn) == topic label turn selected?
    annotations_result$annotated_turn <- ifelse(annotations_result$A_end_turn %in% PID_turns,
                                                "yes", "no")
    # 1b) if yes, add the label provided by the participant
    # (seq_along avoids the 1:length() zero-length footgun)
    for (c in seq_along(PID_labels)) {
      annotations_result$annotated_label[annotations_result$A_end_turn == PID_turns[c]] <- PID_labels[c]
    }
    per_pid_results[[k]] <- annotations_result
  }
  # bind all participants' rows into a single data frame
  if (length(per_pid_results) > 0) {
    do.call(rbind, per_pid_results)
  } else {
    data.frame()
  }
}

10 utterances

# apply function
# tag each 10-utterance tiled window with the participants' annotations
tile_ann_10 <- detect_window_annotations(df_tile_10, df_ann)
# calculate distance from the human annotation within the tile data
ann_dist_10 <- tile_ann_10 |>
  dplyr::group_by(transcript_id) |>
  mutate(annotation_dist = case_when(annotated_turn == "yes" ~ 0,
                                     lag(annotated_turn, 1) == "yes" ~ 1, lag(annotated_turn, 2) == "yes" ~ 2,
                                     lag(annotated_turn, 3) == "yes" ~ 3, lag(annotated_turn, 4) == "yes" ~ 4,
                                     lag(annotated_turn, 5) == "yes" ~ 5, lag(annotated_turn, 6) == "yes" ~ 6,
                                     lag(annotated_turn, 7) == "yes" ~ 7, lag(annotated_turn, 8) == "yes" ~ 8,
                                     lag(annotated_turn, 9) == "yes" ~ 9, lag(annotated_turn, 10) == "yes" ~ 10,
                                     lag(annotated_turn, 11) == "yes" ~ 11, lag(annotated_turn, 12) == "yes" ~ 12,
                                     lag(annotated_turn, 13) == "yes" ~ 13, lag(annotated_turn, 14) == "yes" ~ 14,
                                     lag(annotated_turn, 15) == "yes" ~ 15, lag(annotated_turn, 16) == "yes" ~ 16,
                                     lag(annotated_turn, 17) == "yes" ~ 17, lag(annotated_turn, 18) == "yes" ~ 18,
                                     lag(annotated_turn, 19) == "yes" ~ 19, lag(annotated_turn, 20) == "yes" ~ 20,
                                     lag(annotated_turn, 21) == "yes" ~ 21, lag(annotated_turn, 22) == "yes" ~ 22,
                                     lag(annotated_turn, 23) == "yes" ~ 23, lag(annotated_turn, 24) == "yes" ~ 24,
                                     lag(annotated_turn, 25) == "yes" ~ 25, lag(annotated_turn, 26) == "yes" ~ 26,
                                     lag(annotated_turn, 27) == "yes" ~ 27, lag(annotated_turn, 28) == "yes" ~ 28,
                                     lag(annotated_turn, 29) == "yes" ~ 29, 
                                     lead(annotated_turn, 1) == "yes" ~ -1, lead(annotated_turn, 2) == "yes" ~ -2,
                                     lead(annotated_turn, 3) == "yes" ~ -3, lead(annotated_turn, 4) == "yes" ~ -4,
                                     lead(annotated_turn, 5) == "yes" ~ -5, lead(annotated_turn, 6) == "yes" ~ -6,
                                     lead(annotated_turn, 7) == "yes" ~ -7, lead(annotated_turn, 8) == "yes" ~ -8,
                                     lead(annotated_turn, 9) == "yes" ~ -9, lead(annotated_turn, 10) == "yes" ~ -10,
                                     lead(annotated_turn, 11) == "yes" ~ -11, lead(annotated_turn, 12) == "yes" ~ -12,
                                     lead(annotated_turn, 13) == "yes" ~ -13, lead(annotated_turn, 14) == "yes" ~ -14,
                                     lead(annotated_turn, 15) == "yes" ~ -15, lead(annotated_turn, 16) == "yes" ~ -16,
                                     lead(annotated_turn, 17) == "yes" ~ -17, lead(annotated_turn, 18) == "yes" ~ -18,
                                     lead(annotated_turn, 19) == "yes" ~ -19, lead(annotated_turn, 20) == "yes" ~ -20,
                                     lead(annotated_turn, 21) == "yes" ~ -21, lead(annotated_turn, 22) == "yes" ~ -22,
                                     lead(annotated_turn, 23) == "yes" ~ -23, lead(annotated_turn, 24) == "yes" ~ -24,
                                     lead(annotated_turn, 25) == "yes" ~ -25, lead(annotated_turn, 26) == "yes" ~ -26,
                                     lead(annotated_turn, 27) == "yes" ~ -27, lead(annotated_turn, 28) == "yes" ~ -28,
                                     lead(annotated_turn, 29) == "yes" ~ -29,
                                     .default = NA)) |>
  select(transcript_id, annotation_dist, cosine_similarity, PID, annotation_behavior) |>
  na.omit()
# get summary for annotation distance
dist_summary_10 <- summarySE(ann_dist_10, measurevar = "cosine_similarity", 
                               groupvars = c("annotation_dist", "annotation_behavior"))
# only include -15 to 15
dist_subset_10 <- dist_summary_10 |>
  filter(annotation_dist >= -15 & annotation_dist <= 15)

# plot
plot10 <- ggplot(data = dist_subset_10, aes(x = annotation_dist, y = cosine_similarity,
                                     color = annotation_behavior, fill = annotation_behavior)) +
  geom_vline(aes(xintercept = 0), color = "grey", linetype = "dashed") +
  geom_point(alpha = 0) +
  geom_line(linewidth = 2) +
  geom_ribbon(aes(ymin = cosine_similarity - se, ymax = cosine_similarity + se), 
              alpha = 0.25, color = NA) +
  labs(x = "Turn Distance from Annotation", y = "Cosine Similarity",
       title = "10 Adjacent Utterances") +
  scale_color_manual(values = c("#e96052", "#ffd16d", "#4f8dae"), name = 'Annotation\nBehavior') +
  scale_fill_manual(values = c("#e96052", "#ffd16d", "#4f8dae"), name = 'Annotation\nBehavior') +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 4)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
  theme_cowplot() +
  theme(plot.title = element_text(hjust = 0.5))

# show
plot10

15 utterances

# apply function
tile_ann_15 <- detect_window_annotations(df_tile_15, df_ann)

# helper: signed turn distance from each turn to a human annotation.
# Replaces a 60-line case_when() ladder of lag()/lead() checks while
# matching its first-match-wins order exactly: a preceding annotation
# within max_dist turns always wins (positive distance); only otherwise
# is a following annotation within max_dist used (negative distance).
annotation_distance <- function(annotated, max_dist = 29) {
  yes_idx <- which(annotated == "yes")
  out <- rep(NA_real_, length(annotated))
  for (i in seq_along(annotated)) {
    before <- yes_idx[yes_idx <= i]
    if (length(before) > 0 && (i - max(before)) <= max_dist) {
      out[i] <- i - max(before)      # 0 when this turn itself is annotated
    } else {
      after <- yes_idx[yes_idx > i]
      if (length(after) > 0 && (min(after) - i) <= max_dist) {
        out[i] <- -(min(after) - i)  # annotation lies ahead of this turn
      }
    }
  }
  out
}

# calculate distance from the human annotation within the tile data
# (distances are computed within each transcript, as before)
ann_dist_15 <- tile_ann_15 |>
  dplyr::group_by(transcript_id) |>
  mutate(annotation_dist = annotation_distance(annotated_turn)) |>
  select(transcript_id, annotation_dist, cosine_similarity, PID, annotation_behavior) |>
  na.omit()
# get summary (mean, se, ...) for annotation distance
dist_summary_15 <- summarySE(ann_dist_15, measurevar = "cosine_similarity", 
                               groupvars = c("annotation_dist", "annotation_behavior"))
# only include -15 to 15
dist_subset_15 <- dist_summary_15 |>
  filter(annotation_dist >= -15 & annotation_dist <= 15)

# plot: mean cosine similarity by turn distance from annotation, per behavior
plot15 <- ggplot(data = dist_subset_15, aes(x = annotation_dist, y = cosine_similarity,
                                     color = annotation_behavior, fill = annotation_behavior)) +
  geom_vline(aes(xintercept = 0), color = "grey", linetype = "dashed") +
  # invisible points (alpha = 0) kept from the original -- presumably to
  # anchor scales/legend; confirm before removing
  geom_point(alpha = 0) +
  geom_line(linewidth = 2) +
  # ribbon = +/- 1 standard error around the mean
  geom_ribbon(aes(ymin = cosine_similarity - se, ymax = cosine_similarity + se), 
              alpha = 0.25, color = NA) +
  labs(x = "Turn Distance from Annotation", y = "Cosine Similarity",
       title = "15 Adjacent Utterances") +
  scale_color_manual(values = c("#e96052", "#ffd16d", "#4f8dae"), name = 'Annotation\nBehavior') +
  scale_fill_manual(values = c("#e96052", "#ffd16d", "#4f8dae"), name = 'Annotation\nBehavior') +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 4)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
  theme_cowplot() +
  theme(plot.title = element_text(hjust = 0.5))

# show
plot15

20 utterances

# apply function
tile_ann_20 <- detect_window_annotations(df_tile_20, df_ann)

# helper: signed turn distance from each turn to a human annotation.
# Replaces a 60-line case_when() ladder of lag()/lead() checks while
# matching its first-match-wins order exactly: a preceding annotation
# within max_dist turns always wins (positive distance); only otherwise
# is a following annotation within max_dist used (negative distance).
annotation_distance <- function(annotated, max_dist = 29) {
  yes_idx <- which(annotated == "yes")
  out <- rep(NA_real_, length(annotated))
  for (i in seq_along(annotated)) {
    before <- yes_idx[yes_idx <= i]
    if (length(before) > 0 && (i - max(before)) <= max_dist) {
      out[i] <- i - max(before)      # 0 when this turn itself is annotated
    } else {
      after <- yes_idx[yes_idx > i]
      if (length(after) > 0 && (min(after) - i) <= max_dist) {
        out[i] <- -(min(after) - i)  # annotation lies ahead of this turn
      }
    }
  }
  out
}

# calculate distance from the human annotation within the tile data
# (distances are computed within each transcript, as before)
ann_dist_20 <- tile_ann_20 |>
  dplyr::group_by(transcript_id) |>
  mutate(annotation_dist = annotation_distance(annotated_turn)) |>
  select(transcript_id, annotation_dist, cosine_similarity, PID, annotation_behavior) |>
  na.omit()
# get summary (mean, se, ...) for annotation distance
dist_summary_20 <- summarySE(ann_dist_20, measurevar = "cosine_similarity", 
                               groupvars = c("annotation_dist", "annotation_behavior"))
# only include -15 to 15
dist_subset_20 <- dist_summary_20 |>
  filter(annotation_dist >= -15 & annotation_dist <= 15)

# plot: mean cosine similarity by turn distance from annotation, per behavior
plot20 <- ggplot(data = dist_subset_20, aes(x = annotation_dist, y = cosine_similarity,
                                     color = annotation_behavior, fill = annotation_behavior)) +
  geom_vline(aes(xintercept = 0), color = "grey", linetype = "dashed") +
  # invisible points (alpha = 0) kept from the original -- presumably to
  # anchor scales/legend; confirm before removing
  geom_point(alpha = 0) +
  geom_line(linewidth = 2) +
  # ribbon = +/- 1 standard error around the mean
  geom_ribbon(aes(ymin = cosine_similarity - se, ymax = cosine_similarity + se), 
              alpha = 0.25, color = NA) +
  labs(x = "Turn Distance from Annotation", y = "Cosine Similarity",
       title = "20 Adjacent Utterances") +
  scale_color_manual(values = c("#e96052", "#ffd16d", "#4f8dae"), name = 'Annotation\nBehavior') +
  scale_fill_manual(values = c("#e96052", "#ffd16d", "#4f8dae"), name = 'Annotation\nBehavior') +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 4)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
  theme_cowplot() +
  theme(plot.title = element_text(hjust = 0.5))

# show
plot20

Combined

# create faceted image: three panels side by side, shared legend, A/B/C tags
combined <- plot10 + plot15 + plot20 +
  plot_layout(widths = c(1, 1, 1), guides = "collect") +
  plot_annotation(tag_levels = "A") &
  theme(plot.tag = element_text(size = 20, face = "bold", family = "sans"))

# save combined figure (pdf for the paper, png for previews)
# note: argument was previously misspelled "heigh" and only worked via R's
# partial argument matching; spelled out as "height" here
ggsave(plot = combined,
       filename = "./cogsci/figures/tile-combined.pdf",
       dpi = 300,
       units = "in",
       width = 16,
       height = 6)

ggsave(plot = combined,
       filename = "./cogsci/figures/tile-combined.png",
       dpi = 300,
       units = "in",
       width = 16,
       height = 6)

# show combined figure
combined

6. Illustrative Example

# one participant's annotation locations

# example transcript (UUID of the conversation shown in the illustrative figure)
transcript <- "f81a3aa9-3cb3-4df0-ba18-9b1f6f19e5ba"
# example PID for each annotation style
# NOTE(review): PIDs look like stringified Python lists carried over from the
# upstream pipeline; they are matched verbatim against the PID column -- confirm
PID_coarse <- "[False, '66bbc95713288ad8d1e8578d', None]"
PID_granular <- "[False, '6724d4dc0e4e106b64f4dd4b', None]"

# COARSE
# pull the coarse annotator's turns for the example transcript
example_coarse <- transcripts_clusters |>
  filter(transcript_id == transcript & PID == PID_coarse) |>
  select(PID, scaled_turn_id, cluster_label, new_topic)
# get similarity data, averaged per (turn, cluster, topic)
# .groups = "drop" silences the summarise() regrouping message and matches
# the convention used earlier in this file; the result is ungroup()-ed
# immediately downstream, so grouping semantics are unchanged
example_coarse <- merge(example_coarse, sim, by = c("PID", "scaled_turn_id", "cluster_label")) |>
  group_by(scaled_turn_id, cluster_label, new_topic) |>
  summarize(avg_topic_utterance_similarity = mean(topic_utterance_similarity),
            avg_topic_cluster_similarity = mean(topic_cluster_similarity),
            avg_cluster_utterance_similarity = mean(cluster_utterance_similarity),
            .groups = "drop")
`summarise()` has grouped output by 'scaled_turn_id', 'cluster_label'. You can
override using the `.groups` argument.
# tag annotation switches: a turn is a switch (1) when its label differs from
# the previous turn's; the first turn (lag = NA) stays NA and never counts
example_coarse <- ungroup(example_coarse)
example_coarse <- arrange(example_coarse, scaled_turn_id)
example_coarse <- mutate(example_coarse,
                         switch_topic = ifelse(new_topic != lag(new_topic), 1, NA),
                         switch_cluster = ifelse(cluster_label != lag(cluster_label), 1, NA))

# GRANULAR
# pull the granular annotator's turns for the same example transcript
example_granular <- transcripts_clusters |>
  filter(transcript_id == transcript & PID == PID_granular) |>
  select(PID, scaled_turn_id, cluster_label, new_topic)
# get similarity data, averaged per (turn, cluster, topic)
# .groups = "drop" silences the summarise() regrouping message and matches
# the convention used earlier in this file; the result is ungroup()-ed
# immediately downstream, so grouping semantics are unchanged
example_granular <- merge(example_granular, sim, by = c("PID", "scaled_turn_id", "cluster_label")) |>
  group_by(scaled_turn_id, cluster_label, new_topic) |>
  summarize(avg_topic_utterance_similarity = mean(topic_utterance_similarity),
            avg_topic_cluster_similarity = mean(topic_cluster_similarity),
            avg_cluster_utterance_similarity = mean(cluster_utterance_similarity),
            .groups = "drop")
`summarise()` has grouped output by 'scaled_turn_id', 'cluster_label'. You can
override using the `.groups` argument.
# tag annotation switches: a turn is a switch (1) when its label differs from
# the previous turn's; the first turn (lag = NA) stays NA and never counts
example_granular <- ungroup(example_granular)
example_granular <- arrange(example_granular, scaled_turn_id)
example_granular <- mutate(example_granular,
                           switch_topic = ifelse(new_topic != lag(new_topic), 1, NA),
                           switch_cluster = ifelse(cluster_label != lag(cluster_label), 1, NA))
# plot: coarse annotator. Cluster assignment drawn as a colored tile strip,
# with three similarity traces overlaid on top of it:
#   solid   = topic-utterance similarity
#   dashed  = topic-cluster similarity
#   dotdash = cluster-utterance similarity
# x-axis is scaled turn ID expressed as percent of conversation completed.
ggplot() +
  geom_tile(data = example_coarse, aes(x = scaled_turn_id*100, y = 0.5, fill = cluster_label)) +
  geom_line(data = example_coarse, aes(x = scaled_turn_id*100, y = avg_topic_utterance_similarity),
            linetype = "solid") +
  geom_line(data = example_coarse, aes(x = scaled_turn_id*100, y = avg_topic_cluster_similarity),
            linetype = "dashed") +
  geom_line(data = example_coarse, aes(x = scaled_turn_id*100, y = avg_cluster_utterance_similarity),
            linetype = "dotdash", color = "black") +
  # geom_vline(data = example_coarse[!is.na(example_coarse$switch_cluster),],
  #            aes(xintercept = (scaled_turn_id*100)-0.5, color = cluster_label),
  #            color = "white", linetype = "dotted") +
  theme_cowplot() +
  # 7 colors = number of distinct clusters this annotator used
  scale_fill_manual(values=met.brewer("Hiroshige", 7)) +
  labs(x = "Conversation Completion (%)", y = NULL,
       title = "Coarse Annotator") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5))

# plot: granular annotator. Same layout as the coarse plot above -- a tile
# strip of cluster assignments with three similarity traces overlaid
# (solid = topic-utterance, dashed = topic-cluster, dotdash = cluster-utterance).
ggplot() +
  geom_tile(data = example_granular, aes(x = scaled_turn_id*100, y = 0.5, fill = cluster_label)) +
  geom_line(data = example_granular, aes(x = scaled_turn_id*100, y = avg_topic_utterance_similarity),
            linetype = "solid") +
  geom_line(data = example_granular, aes(x = scaled_turn_id*100, y = avg_topic_cluster_similarity),
            linetype = "dashed") +
  geom_line(data = example_granular, aes(x = scaled_turn_id*100, y = avg_cluster_utterance_similarity),
            linetype = "dotdash", color = "black") +
  theme_cowplot() +
  # 15 colors = number of distinct clusters this annotator used
  scale_fill_manual(values=met.brewer("Hiroshige", 15)) +
  labs(x = "Conversation Completion (%)", y = NULL,
       title = "Granular Annotator") +
  scale_x_continuous(expand = c(0, 0)) +
  # NOTE(review): negative multiplicative expand (-0.1) with hard limits is
  # unusual -- presumably deliberate to tuck the tile strip inside the panel;
  # confirm the rendered output looks as intended
  scale_y_continuous(expand = c(-0.1, 0), limits = c(-0.1,1.2)) +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5))

# get tiled semantic similarity from this transcript only
example_tile <- filter(
  read.csv("./data/output/annotated_transcripts_tile_10_0.csv"),
  transcript_id == transcript
)
# the coarse annotator's processed annotations
example_annotation <- filter(
  read.csv("./data/processed/dense_subset_processed.csv"),
  PID == PID_coarse
)
# apply function to see where annotations fall relative to transcript
example_tile <- detect_window_annotations(example_tile, example_annotation)

# match annotated labels back to their cluster assignments (one row per
# unique label/cluster pair for this participant)
example_cluster <- transcripts_clusters |>
  filter(PID == PID_coarse) |>
  rename(annotated_label = new_topic) |>
  select(cluster_label, annotated_label) |>
  distinct()

# merge: keep every tile row, attach a cluster label where annotated
example_tile <- merge(example_tile, example_cluster, by = "annotated_label", all.x = TRUE)
# plot: cosine similarity across the transcript with dashed vertical lines
# at the turns this participant annotated, colored by cluster label
ggplot(data = example_tile, aes(x = A_start_turn, y = cosine_similarity)) +
  geom_line() +
  # only rows that carry an annotated label get a vline
  geom_vline(data = example_tile[!is.na(example_tile$annotated_label),],
             aes(xintercept = A_start_turn, color = cluster_label), linetype = "dashed") +
  theme_void() +
  theme(legend.position = "none")

# save
# ggsave("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/cogsci/figures/example_cosine_similarity.pdf",
#        width = 15,
#        height = 2,
#        units = "in")

# count turns per cluster/theme, largest first
# n() is the idiomatic dplyr row count (replaces length(scaled_turn_id));
# .groups = "drop" silences the summarise() regrouping message
transcripts_clusters |>
  ungroup() |>
  dplyr::group_by(cluster_label, group_theme) |>
  summarize(cluster_count = n(), .groups = "drop") |>
  arrange(desc(cluster_count))
`summarise()` has grouped output by 'cluster_label'. You can override using the
`.groups` argument.
# A tibble: 50 × 3
# Groups:   cluster_label [50]
   cluster_label                group_theme     cluster_count
   <chr>                        <chr>                   <int>
 1 covid                        COVID-19                23489
 2 mood, preferences            preferences             23133
 3 employment, job              work                    20725
 4 crime, protest, government   news & politics         19891
 5 technology, computer         school                  18131
 6 hobbies, activites, sports   entertainment           17276
 7 people, fictional characters relationships           17061
 8 items, fashion, clothing     entertainment           16907
 9 money, economy               work                    16268
10 personal experience          preferences             16050
# ℹ 40 more rows

——————-


ARCHIVE